library(ggplot2)
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(Metrics)
library(plyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(reshape2)
library(corrplot)
## corrplot 0.92 loaded
library(plotly)
##
## Attaching package: 'plotly'
## The following objects are masked from 'package:plyr':
##
## arrange, mutate, rename, summarise
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(caTools)
library(stargazer)
##
## Please cite as:
## Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
library(class)
library(rpart)
library(rpart.plot)
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
library(e1071)
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following objects are masked from 'package:Metrics':
##
## precision, recall
#library(DescTools)
rm(list=ls()) # Remove anything that might currently be in memory so we can start fresh.
library(MASS) # Load the MASS package
##
## Attaching package: 'MASS'
## The following object is masked from 'package:plotly':
##
## select
## The following object is masked from 'package:dplyr':
##
## select
BostonHousing <- read.csv("Boston.csv")
colnames(BostonHousing) <- c("CrimePerCapita","Residential","NonRetail","CharlesRiver","N_Oxides",
"RoomsPerDwelling","Pre40HouseAge","BusinessCenterProximity",
"HighwayProximity","Tax","PupilPerTeacher","BlackIndex",
"LowStatus","HomeValue" )
print(head(BostonHousing))
## CrimePerCapita Residential NonRetail CharlesRiver N_Oxides RoomsPerDwelling
## 1 0.00632 18 2.31 0 0.538 6.575
## 2 0.02731 0 7.07 0 0.469 6.421
## 3 0.02729 0 7.07 0 0.469 7.185
## 4 0.03237 0 2.18 0 0.458 6.998
## 5 0.06905 0 2.18 0 0.458 7.147
## 6 0.02985 0 2.18 0 0.458 6.430
## Pre40HouseAge BusinessCenterProximity HighwayProximity Tax PupilPerTeacher
## 1 65.2 4.0900 1 296 15.3
## 2 78.9 4.9671 2 242 17.8
## 3 61.1 4.9671 2 242 17.8
## 4 45.8 6.0622 3 222 18.7
## 5 54.2 6.0622 3 222 18.7
## 6 58.7 6.0622 3 222 18.7
## BlackIndex LowStatus HomeValue
## 1 396.90 4.98 24.0
## 2 396.90 9.14 21.6
## 3 392.83 4.03 34.7
## 4 394.63 2.94 33.4
## 5 396.90 5.33 36.2
## 6 394.12 5.21 28.7
This data is taken from the Boston Standard Metropolitan Statistical Area (SMSA) in 1970. It was uploaded to Kaggle.com by Patrick Parsa.
str(BostonHousing)
## 'data.frame': 506 obs. of 14 variables:
## $ CrimePerCapita : num 0.00632 0.02731 0.02729 0.03237 0.06905 ...
## $ Residential : num 18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
## $ NonRetail : num 2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
## $ CharlesRiver : int 0 0 0 0 0 0 0 0 0 0 ...
## $ N_Oxides : num 0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
## $ RoomsPerDwelling : num 6.58 6.42 7.18 7 7.15 ...
## $ Pre40HouseAge : num 65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
## $ BusinessCenterProximity: num 4.09 4.97 4.97 6.06 6.06 ...
## $ HighwayProximity : int 1 2 2 3 3 3 5 5 5 5 ...
## $ Tax : num 296 242 242 222 222 222 311 311 311 311 ...
## $ PupilPerTeacher : num 15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
## $ BlackIndex : num 397 397 393 395 397 ...
## $ LowStatus : num 4.98 9.14 4.03 2.94 5.33 ...
## $ HomeValue : num 24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
stargazer(BostonHousing, type = 'text', title='BostonHousing: Summary Statistics')
##
## BostonHousing: Summary Statistics
## ===========================================================
## Statistic N Mean St. Dev. Min Max
## -----------------------------------------------------------
## CrimePerCapita 506 3.614 8.602 0.006 88.976
## Residential 506 11.364 23.322 0.000 100.000
## NonRetail 506 11.137 6.860 0.460 27.740
## CharlesRiver 506 0.069 0.254 0 1
## N_Oxides 506 0.555 0.116 0.385 0.871
## RoomsPerDwelling 506 6.285 0.703 3.561 8.780
## Pre40HouseAge 506 68.575 28.149 2.900 100.000
## BusinessCenterProximity 506 3.795 2.106 1.130 12.126
## HighwayProximity 506 9.549 8.707 1 24
## Tax 506 408.237 168.537 187 711
## PupilPerTeacher 506 18.456 2.165 12.600 22.000
## BlackIndex 506 356.674 91.295 0.320 396.900
## LowStatus 506 12.653 7.141 1.730 37.970
## HomeValue 506 22.533 9.197 5.000 50.000
## -----------------------------------------------------------
There are 506 rows and 14 columns. Each row represents a town or suburb observed at the time by the SMSA, and each column records a locality characteristic within each town deemed noteworthy by the SMSA; these columns are the variables of the dataset. All columns hold numeric values, with a few integer-valued columns representing categorical variables. We are going to build a model to predict the median home value within these localities.
HomeValue is our target variable: the median value of the homes in each town or suburb observed within this dataset, recorded in units of $1,000. Every other variable is an independent variable that will help predict HomeValue in our model.
With HomeValue as our target variable, we are left with 13 independent variables:
CrimePerCapita : The per-capita crime rate by town.
Residential : The proportion of residential land zoned for lots over 25,000 square feet.
NonRetail : The proportion of non-retail business acres per town.
CharlesRiver : A dummy variable recording whether an area of land is bounded by the Charles River; 1 for yes, 0 for no.
N_Oxides : The nitric oxide concentration, recorded in parts per 10 million.
RoomsPerDwelling : The average number of rooms per home for the town.
Pre40HouseAge : The proportion of owner-occupied units built prior to 1940.
BusinessCenterProximity : The weighted distances to five Boston employment centers.
HighwayProximity : An index of accessibility to highways leading to or from an urban center.
Tax : The full-value property tax rate per $10,000.
PupilPerTeacher : The pupil-teacher ratio by town.
BlackIndex : An index based on the proportion of Black residents by town (1000(Bk - 0.63)^2 in the original dataset, where Bk is that proportion).
LowStatus : The percentage of the population classified as lower status.
# Visualizing our dependent variable y(HomeValue)
ggplot(BostonHousing, aes(x = HomeValue)) + geom_histogram(color ="black",fill="black")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
The distribution of HomeValue is slightly right-skewed.
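The stat_bin() message above is only a hint; we could silence it by setting the binwidth explicitly. A minimal sketch (binwidth = 2, i.e. $2,000 on this variable's scale, is an illustrative choice, not from the original code):
# Same histogram with an explicit binwidth instead of the default 30 bins
ggplot(BostonHousing, aes(x = HomeValue)) + geom_histogram(binwidth = 2, color ="black",fill="black")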
ggplot(BostonHousing, aes(x = CrimePerCapita)) + geom_histogram(color ="black",fill="black")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot (BostonHousing, aes (x = CrimePerCapita, y = HomeValue)) +
geom_point()
The distribution of CrimePerCapita is heavily right-skewed. When plotting HomeValue with respect to CrimePerCapita, we can see a negative correlation. As CrimePerCapita increases, the HomeValue tends to decrease.
ggplot(BostonHousing, aes(x = Residential)) + geom_histogram(color ="black",fill="black")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot (BostonHousing, aes (x = Residential, y = HomeValue)) +
geom_point()
The distribution of Residential is right-skewed. When plotting HomeValue with respect to Residential, we observe a generally positive correlation, such that as the Residential proportion increases, the HomeValue increases as well.
ggplot(BostonHousing, aes(x = NonRetail)) + geom_histogram(color ="black",fill="black")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot (BostonHousing, aes (x = NonRetail, y = HomeValue)) +
geom_point()
The distribution of NonRetail is slightly left-skewed. When plotting HomeValue with respect to NonRetail, we observe a negative correlation such that as NonRetail proportion increases, HomeValue decreases.
ggplot(BostonHousing, aes(x = CharlesRiver)) + geom_bar(color ="black",fill="black")
ggplot (BostonHousing, aes (x = CharlesRiver, y = HomeValue)) +
geom_point()
The distribution of CharlesRiver is heavily imbalanced, with the vast majority of data points having a value of 0.
ggplot(BostonHousing, aes(x = N_Oxides)) + geom_histogram(color ="black",fill="black")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot (BostonHousing, aes (x = N_Oxides, y = HomeValue)) +
geom_point()
The distribution of N_Oxides is right-skewed. When plotting HomeValue with respect to N_Oxides, we observe a negative correlation: as N_Oxides increases, HomeValue decreases. This relationship is fairly intuitive, since most individuals would prefer not to live in a more heavily polluted area.
ggplot(BostonHousing, aes(x = RoomsPerDwelling)) + geom_histogram(color ="black",fill="black")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot (BostonHousing, aes (x = RoomsPerDwelling, y = HomeValue)) +
geom_point()
The distribution of RoomsPerDwelling appears relatively normal; the average number of rooms within these localities ranges between 5 and 7. When plotting HomeValue with respect to RoomsPerDwelling, we observe a positive correlation: as RoomsPerDwelling increases, HomeValue increases as well. Most people would prefer houses with more rooms, but those typically cost more.
ggplot(BostonHousing, aes(x = Pre40HouseAge)) + geom_histogram(color ="black",fill="black")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot (BostonHousing, aes (x = Pre40HouseAge, y = HomeValue)) +
geom_point()
The distribution of Pre40HouseAge is left-skewed. Intuitively, older houses cost less than newer houses, and indeed we observe a slight negative correlation when plotting HomeValue with respect to Pre40HouseAge.
ggplot(BostonHousing, aes(x = BusinessCenterProximity)) + geom_histogram(color ="black",fill="black")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot (BostonHousing, aes (x = BusinessCenterProximity, y = HomeValue)) +
geom_point()
The distribution of BusinessCenterProximity is right-skewed. Typically, the closer you are to an employment center, the smaller your house tends to be, which may be one reason HomeValue is lower close to the centers. In contrast, the further away you are, the more land you can buy, so HomeValue tends to be greater, up to a certain extent.
ggplot(BostonHousing, aes(x = HighwayProximity)) + geom_histogram(color ="black",fill="black")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot (BostonHousing, aes (x = HighwayProximity, y = HomeValue)) +
geom_point()
ggplot(BostonHousing, aes(x = Tax)) + geom_histogram(color ="black",fill="black")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot (BostonHousing, aes (x = Tax, y = HomeValue)) +
geom_point()
The distribution of Tax is left-skewed. When plotting HomeValue with respect to Tax, we observe a slight negative correlation: as the property tax rate goes up, HomeValue decreases.
ggplot(BostonHousing, aes(x = PupilPerTeacher)) + geom_histogram(color ="black",fill="black")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot (BostonHousing, aes (x = PupilPerTeacher, y = HomeValue)) +
geom_point()
The distribution of PupilPerTeacher is left-skewed. When plotting HomeValue with respect to PupilPerTeacher, we observe a negative correlation: as PupilPerTeacher increases, HomeValue decreases. House prices are typically driven in large part by the quality of local schools, and good schools usually have a low pupil-teacher ratio, so this negative correlation makes sense.
ggplot(BostonHousing, aes(x = BlackIndex)) + geom_histogram(color ="black",fill="black")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot (BostonHousing, aes (x = BlackIndex, y = HomeValue)) +
geom_point()
The distribution of BlackIndex is left-skewed. When plotting HomeValue with respect to BlackIndex, we observe a positive correlation, such that as BlackIndex increases, HomeValue increases as well.
ggplot(BostonHousing, aes(x = LowStatus)) + geom_histogram(color ="black",fill="black")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot (BostonHousing, aes (x = LowStatus, y = HomeValue)) +
geom_point()
The distribution of LowStatus is slightly right-skewed. When plotting HomeValue with respect to LowStatus, we observe a negative correlation, such that as LowStatus percentage increases, HomeValue decreases.
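GGally was loaded at the top but never used; its ggpairs() could condense several of these pairwise views into a single panel. A sketch over three illustrative columns:
# Pairwise scatter plots, densities, and correlations for a few key variables
ggpairs(BostonHousing[, c("RoomsPerDwelling", "LowStatus", "HomeValue")])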
cor_vars<-BostonHousing[,c("CrimePerCapita","Residential","NonRetail","CharlesRiver","N_Oxides",
"RoomsPerDwelling","Pre40HouseAge","BusinessCenterProximity",
"HighwayProximity","Tax","PupilPerTeacher","BlackIndex",
"LowStatus","HomeValue" )]
cor(cor_vars)
## CrimePerCapita Residential NonRetail CharlesRiver
## CrimePerCapita 1.00000000 -0.20046922 0.40658341 -0.055891582
## Residential -0.20046922 1.00000000 -0.53382819 -0.042696719
## NonRetail 0.40658341 -0.53382819 1.00000000 0.062938027
## CharlesRiver -0.05589158 -0.04269672 0.06293803 1.000000000
## N_Oxides 0.42097171 -0.51660371 0.76365145 0.091202807
## RoomsPerDwelling -0.21924670 0.31199059 -0.39167585 0.091251225
## Pre40HouseAge 0.35273425 -0.56953734 0.64477851 0.086517774
## BusinessCenterProximity -0.37967009 0.66440822 -0.70802699 -0.099175780
## HighwayProximity 0.62550515 -0.31194783 0.59512927 -0.007368241
## Tax 0.58276431 -0.31456332 0.72076018 -0.035586518
## PupilPerTeacher 0.28994558 -0.39167855 0.38324756 -0.121515174
## BlackIndex -0.38506394 0.17552032 -0.35697654 0.048788485
## LowStatus 0.45562148 -0.41299457 0.60379972 -0.053929298
## HomeValue -0.38830461 0.36044534 -0.48372516 0.175260177
## N_Oxides RoomsPerDwelling Pre40HouseAge
## CrimePerCapita 0.42097171 -0.21924670 0.35273425
## Residential -0.51660371 0.31199059 -0.56953734
## NonRetail 0.76365145 -0.39167585 0.64477851
## CharlesRiver 0.09120281 0.09125123 0.08651777
## N_Oxides 1.00000000 -0.30218819 0.73147010
## RoomsPerDwelling -0.30218819 1.00000000 -0.24026493
## Pre40HouseAge 0.73147010 -0.24026493 1.00000000
## BusinessCenterProximity -0.76923011 0.20524621 -0.74788054
## HighwayProximity 0.61144056 -0.20984667 0.45602245
## Tax 0.66802320 -0.29204783 0.50645559
## PupilPerTeacher 0.18893268 -0.35550149 0.26151501
## BlackIndex -0.38005064 0.12806864 -0.27353398
## LowStatus 0.59087892 -0.61380827 0.60233853
## HomeValue -0.42732077 0.69535995 -0.37695457
## BusinessCenterProximity HighwayProximity Tax
## CrimePerCapita -0.37967009 0.625505145 0.58276431
## Residential 0.66440822 -0.311947826 -0.31456332
## NonRetail -0.70802699 0.595129275 0.72076018
## CharlesRiver -0.09917578 -0.007368241 -0.03558652
## N_Oxides -0.76923011 0.611440563 0.66802320
## RoomsPerDwelling 0.20524621 -0.209846668 -0.29204783
## Pre40HouseAge -0.74788054 0.456022452 0.50645559
## BusinessCenterProximity 1.00000000 -0.494587930 -0.53443158
## HighwayProximity -0.49458793 1.000000000 0.91022819
## Tax -0.53443158 0.910228189 1.00000000
## PupilPerTeacher -0.23247054 0.464741179 0.46085304
## BlackIndex 0.29151167 -0.444412816 -0.44180801
## LowStatus -0.49699583 0.488676335 0.54399341
## HomeValue 0.24992873 -0.381626231 -0.46853593
## PupilPerTeacher BlackIndex LowStatus HomeValue
## CrimePerCapita 0.2899456 -0.38506394 0.4556215 -0.3883046
## Residential -0.3916785 0.17552032 -0.4129946 0.3604453
## NonRetail 0.3832476 -0.35697654 0.6037997 -0.4837252
## CharlesRiver -0.1215152 0.04878848 -0.0539293 0.1752602
## N_Oxides 0.1889327 -0.38005064 0.5908789 -0.4273208
## RoomsPerDwelling -0.3555015 0.12806864 -0.6138083 0.6953599
## Pre40HouseAge 0.2615150 -0.27353398 0.6023385 -0.3769546
## BusinessCenterProximity -0.2324705 0.29151167 -0.4969958 0.2499287
## HighwayProximity 0.4647412 -0.44441282 0.4886763 -0.3816262
## Tax 0.4608530 -0.44180801 0.5439934 -0.4685359
## PupilPerTeacher 1.0000000 -0.17738330 0.3740443 -0.5077867
## BlackIndex -0.1773833 1.00000000 -0.3660869 0.3334608
## LowStatus 0.3740443 -0.36608690 1.0000000 -0.7376627
## HomeValue -0.5077867 0.33346082 -0.7376627 1.0000000
trans<-cor(cor_vars)
melted_cormat <- melt(trans)
Visualize the correlation between variables on a heat map (excluding CharlesRiver and BlackIndex):
corrplot(cor(dplyr::select(BostonHousing, -CharlesRiver,-BlackIndex)))
HomeValue decreases as CrimePerCapita (medium), NonRetail (high), N_Oxides (low), Pre40HouseAge (low), HighwayProximity (low), Tax (low), PupilPerTeacher (high), and LowStatus (high) increase, and HomeValue increases with Residential (low) and RoomsPerDwelling (high); the labels in parentheses indicate the strength of each correlation.
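melted_cormat was prepared above but never used; it could drive a ggplot2 version of the same heat map. A sketch:
# ggplot2 heat map from the melted correlation matrix (Var1/Var2/value come from melt())
ggplot(melted_cormat, aes(x = Var1, y = Var2, fill = value)) +
geom_tile() +
scale_fill_gradient2(limits = c(-1, 1)) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))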
Visualize the effect of variables on HomeValue
BostonHousing %>%
dplyr::select(c(CrimePerCapita, RoomsPerDwelling, Pre40HouseAge, HighwayProximity, Tax, LowStatus, HomeValue, NonRetail, N_Oxides, PupilPerTeacher, Residential)) %>%
melt(id.vars = 'HomeValue') %>%
ggplot(aes(x = value, y = HomeValue, color = variable)) +
geom_point(alpha = 0.7) +
stat_smooth(color = 'black') +
facet_wrap(~variable, scales = 'free', ncol = 2) +
labs(x = 'Variable Value', y = 'Median House Price ($1000s)') +
theme_minimal()
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : pseudoinverse used at -0.5
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : neighborhood radius 13
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : reciprocal condition number 2.9038e-31
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : There are other near singularities as well. 156.25
## Warning in predLoess(object$y, object$x, newx = if (is.null(newdata)) object$x
## else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : pseudoinverse used at
## -0.5
## Warning in predLoess(object$y, object$x, newx = if (is.null(newdata)) object$x
## else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : neighborhood radius 13
## Warning in predLoess(object$y, object$x, newx = if (is.null(newdata)) object$x
## else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : reciprocal condition
## number 2.9038e-31
## Warning in predLoess(object$y, object$x, newx = if (is.null(newdata)) object$x
## else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : There are other near
## singularities as well. 156.25
These plots show the same correlations as the heat map.
mean(BostonHousing$HomeValue)
## [1] 22.53281
sd(BostonHousing$HomeValue)
## [1] 9.197104
For modeling, we want each variable to have a mean close to zero and a standard deviation close to one; currently they do not, so we need to scale the data.
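scale() simply applies the z-score transform z = (x - mean(x)) / sd(x) to each column. As a quick check on HomeValue before the data frame is overwritten below (a sketch, not part of the original analysis):
# z-score by hand; should match the scale() output for this column
z <- (BostonHousing$HomeValue - mean(BostonHousing$HomeValue)) / sd(BostonHousing$HomeValue)
head(z)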
BostonHousing <- scale(BostonHousing,center = TRUE, scale = TRUE)
colMeans(BostonHousing) # faster version of apply(BostonHousing, 2, mean)
## CrimePerCapita Residential NonRetail
## -6.899468e-18 2.298337e-17 1.516683e-17
## CharlesRiver N_Oxides RoomsPerDwelling
## -3.510587e-18 -2.149412e-16 -1.058524e-16
## Pre40HouseAge BusinessCenterProximity HighwayProximity
## -1.645039e-16 1.144506e-16 4.651527e-17
## Tax PupilPerTeacher BlackIndex
## 1.906139e-17 -3.931034e-16 -1.155991e-16
## LowStatus HomeValue
## -7.012260e-17 -1.379311e-16
apply(BostonHousing, 2, sd)
## CrimePerCapita Residential NonRetail
## 1 1 1
## CharlesRiver N_Oxides RoomsPerDwelling
## 1 1 1
## Pre40HouseAge BusinessCenterProximity HighwayProximity
## 1 1 1
## Tax PupilPerTeacher BlackIndex
## 1 1 1
## LowStatus HomeValue
## 1 1
BostonHousing <- as.data.frame(BostonHousing)
mean(BostonHousing$HomeValue)
## [1] -1.374631e-16
sd(BostonHousing$HomeValue)
## [1] 1
#make this example reproducible
set.seed(1)
#use 70% of dataset as training set and 30% as test set
BH<- sample(c(TRUE, FALSE), nrow(BostonHousing), replace=TRUE, prob=c(0.7,0.3))
TrainSet <- BostonHousing[BH, ]
TestSet <- BostonHousing[!BH, ]
dim(TrainSet)
## [1] 360 14
dim(TestSet)
## [1] 146 14
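As an aside, drawing TRUE/FALSE per row gives only an approximately 70/30 split (here 360/146). caTools, loaded at the top but otherwise unused, provides sample.split() for an exact ratio; a sketch (TrainAlt/TestAlt are illustrative names):
# Exact 70/30 split with caTools as an alternative to the row-wise draw above
set.seed(1)
split <- sample.split(BostonHousing$HomeValue, SplitRatio = 0.7)
TrainAlt <- subset(BostonHousing, split == TRUE)
TestAlt <- subset(BostonHousing, split == FALSE)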
ModelLR1<-lm(HomeValue ~ CrimePerCapita + Residential + NonRetail + CharlesRiver + N_Oxides + RoomsPerDwelling +
Pre40HouseAge + BusinessCenterProximity + HighwayProximity + Tax + PupilPerTeacher + BlackIndex + LowStatus
, data = TrainSet)
summary(ModelLR1)
##
## Call:
## lm(formula = HomeValue ~ CrimePerCapita + Residential + NonRetail +
## CharlesRiver + N_Oxides + RoomsPerDwelling + Pre40HouseAge +
## BusinessCenterProximity + HighwayProximity + Tax + PupilPerTeacher +
## BlackIndex + LowStatus, data = TrainSet)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.70161 -0.29204 -0.05476 0.18336 2.97775
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.02980 0.02685 -1.110 0.267787
## CrimePerCapita -0.12613 0.03837 -3.287 0.001117 **
## Residential 0.13466 0.03954 3.405 0.000738 ***
## NonRetail 0.05211 0.05361 0.972 0.331675
## CharlesRiver 0.03430 0.02723 1.260 0.208589
## N_Oxides -0.20725 0.05641 -3.674 0.000277 ***
## RoomsPerDwelling 0.34335 0.03934 8.728 < 2e-16 ***
## Pre40HouseAge 0.02398 0.04939 0.486 0.627586
## BusinessCenterProximity -0.31349 0.05144 -6.095 2.92e-09 ***
## HighwayProximity 0.29367 0.07419 3.958 9.17e-05 ***
## Tax -0.25623 0.08311 -3.083 0.002212 **
## PupilPerTeacher -0.20690 0.03576 -5.786 1.61e-08 ***
## BlackIndex 0.12153 0.03236 3.755 0.000203 ***
## LowStatus -0.38857 0.04849 -8.014 1.72e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5054 on 346 degrees of freedom
## Multiple R-squared: 0.7546, Adjusted R-squared: 0.7454
## F-statistic: 81.84 on 13 and 346 DF, p-value: < 2.2e-16
Predicting the performance of our model on the testing data set
predictedHomeValueLR1 <- predict(ModelLR1, TestSet)
TestSet['Predicted.HomeValueLR1'] <- predictedHomeValueLR1
pl1 <- TestSet %>%
ggplot(aes(HomeValue, Predicted.HomeValueLR1)) +
geom_point(alpha = 0.5) +
stat_smooth(color = 'black') +
xlab('Actual Home Value') +
ylab('Predicted Home Value')+
theme_bw()
ggplotly(pl1)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## Warning: The following aesthetics were dropped during statistical transformation:
## x_plotlyDomain, y_plotlyDomain
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
Assessing error rate: RMSE of our model
# Finally, we'll check the prediction accuracy with the MSE, MAE, RMSE, and R-squared metrics.
if(!require('MLmetrics')) {
install.packages('MLmetrics')
library('MLmetrics')
}
## Loading required package: MLmetrics
##
## Attaching package: 'MLmetrics'
## The following objects are masked from 'package:caret':
##
## MAE, RMSE
## The following object is masked from 'package:base':
##
## Recall
mse = MSE(TestSet$HomeValue, predictedHomeValueLR1)
mae = MAE(TestSet$HomeValue, predictedHomeValueLR1)
rmse = RMSE(TestSet$HomeValue, predictedHomeValueLR1)
r2 = R2(TestSet$HomeValue, predictedHomeValueLR1, form = "traditional")
cat(" MAE:", mae, "\n", "MSE:", mse, "\n",
"RMSE:", rmse, "\n", "R-squared:", r2)
## MAE: 0.3693966
## MSE: 0.3117648
## RMSE: 0.558359
## R-squared: 0.6252161
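These metrics are simple functions of the test-set residuals; the same numbers by hand (a sketch of the textbook formulas, not part of the original analysis):
# RMSE = sqrt(mean(residual^2)); traditional R-squared = 1 - SS_res / SS_tot
res <- TestSet$HomeValue - predictedHomeValueLR1
sqrt(mean(res^2)) # RMSE
1 - sum(res^2) / sum((TestSet$HomeValue - mean(TestSet$HomeValue))^2) # R-squared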
R-squared on the TestSet for this model is 0.62, meaning the model explains about 62% of the variance in HomeValue. The RMSE for our multiple regression model is 0.56, which is not bad; ideally, it would fall between 0.2 and 0.5.
So, we will create a new model, dropping the variables whose p-values are not statistically significant (NonRetail, CharlesRiver, and Pre40HouseAge) and, based on the plots of each variable's effect on HomeValue, also dropping Residential, N_Oxides, Tax, PupilPerTeacher, and BlackIndex.
That leaves CrimePerCapita, RoomsPerDwelling, BusinessCenterProximity, HighwayProximity, and LowStatus, all of which have statistically significant p-values.
ModelLR2<-lm(HomeValue ~ CrimePerCapita + RoomsPerDwelling + BusinessCenterProximity + HighwayProximity + LowStatus
, data = TrainSet)
summary(ModelLR2)
##
## Call:
## lm(formula = HomeValue ~ CrimePerCapita + RoomsPerDwelling +
## BusinessCenterProximity + HighwayProximity + LowStatus, data = TrainSet)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.0479 -0.3305 -0.0871 0.1849 3.1890
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.02661 0.03008 -0.885 0.376870
## CrimePerCapita -0.11065 0.04271 -2.591 0.009971 **
## RoomsPerDwelling 0.42981 0.04035 10.651 < 2e-16 ***
## BusinessCenterProximity -0.12017 0.03569 -3.367 0.000843 ***
## HighwayProximity -0.07191 0.04218 -1.705 0.089120 .
## LowStatus -0.47413 0.04821 -9.835 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5681 on 354 degrees of freedom
## Multiple R-squared: 0.6828, Adjusted R-squared: 0.6783
## F-statistic: 152.4 on 5 and 354 DF, p-value: < 2.2e-16
Predicting the performance of our model on the testing data set
predictedHomeValue_LR2 <- predict(ModelLR2, TestSet)
TestSet['Predicted.HomeValueLR2'] <- predictedHomeValue_LR2
pl2 <- TestSet %>%
ggplot(aes(x = HomeValue, y = Predicted.HomeValueLR2)) +
geom_point(alpha = 0.5) +
stat_smooth(color = 'black') +
xlab('Actual Home Value') +
ylab('Predicted Home Value')+
theme_bw()
ggplotly(pl2)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## Warning: The following aesthetics were dropped during statistical transformation:
## x_plotlyDomain, y_plotlyDomain
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
We can see there are a few outliers at HomeValue = 50 (on the original scale).
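A caveat before acting on this: HomeValue was standardized earlier, so the original $50k cap no longer appears as the literal value 50; it is the maximum of the scaled column. A sketch for locating the capped rows:
# The original $50k cap corresponds to the maximum of the standardized column (~2.99)
cap <- max(BostonHousing$HomeValue)
sum(BostonHousing$HomeValue == cap) # number of capped observations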
Assessing error rate: RMSE of our model
# Finally, we'll check the prediction accuracy with the MSE, MAE, RMSE, and R-squared metrics.
mse = MSE(TestSet$HomeValue, predictedHomeValue_LR2)
mae = MAE(TestSet$HomeValue, predictedHomeValue_LR2)
rmse = RMSE(TestSet$HomeValue, predictedHomeValue_LR2)
r2 = R2(TestSet$HomeValue, predictedHomeValue_LR2, form = "traditional")
cat(" MAE:", mae, "\n", "MSE:", mse, "\n",
"RMSE:", rmse, "\n", "R-squared:", r2)
## MAE: 0.4376023
## MSE: 0.4028163
## RMSE: 0.6346781
## R-squared: 0.4958962
The R-squared value for this model is 0.49, which is worse than our previous model. The RMSE is 0.63, also higher than Model 1's. Model 2 clearly does not work as well.
So, to further improve the performance we can remove the outliers at HomeValue = 50, and eliminate features like BusinessCenterProximity and HighwayProximity. We will also create an interaction term between LowStatus and RoomsPerDwelling and see if it improves our adjusted R-squared.
# Note: HomeValue was standardized above, so the original $50k cap is ~2.99 on this
# scale; as written, this filter removes no rows (506 total, split 360/146 below).
df <- subset(BostonHousing, HomeValue != 50)
df['LowStatusC'] <- df$LowStatus - mean(df$LowStatus)
df['RoomsPerDwellingC'] <- df$RoomsPerDwelling - mean(df$RoomsPerDwelling)
df['RMLStat'] <- df$RoomsPerDwellingC * df$LowStatusC
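As an alternative to building the centered product column by hand, lm() can construct the interaction directly in the formula (with both main effects included, centering changes the main-effect coefficients but not the interaction coefficient). A sketch fit on df for illustration; ModelLR3b is a hypothetical name:
# Interaction specified in the formula: expands to main effects plus their product
ModelLR3b <- lm(HomeValue ~ CrimePerCapita + RoomsPerDwelling * LowStatus, data = df)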
#Split Data into TrainSet and TestSet
#make this example reproducible
set.seed(1)
#use 70% of dataset as training set and 30% as test set
BH2<- sample(c(TRUE, FALSE), nrow(df), replace=TRUE, prob=c(0.7,0.3))
TrainSet2 <- df[BH2, ]
TestSet2 <- df[!BH2, ]
dim(TrainSet2)
## [1] 360 17
dim(TestSet2)
## [1] 146 17
ModelLR3<-lm(HomeValue ~ CrimePerCapita + RoomsPerDwelling + LowStatus + RMLStat
, data = TrainSet2)
summary(ModelLR3)
##
## Call:
## lm(formula = HomeValue ~ CrimePerCapita + RoomsPerDwelling +
## LowStatus + RMLStat, data = TrainSet2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.5382 -0.2703 -0.0604 0.2077 3.6385
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.17881 0.02905 -6.154 2.04e-09 ***
## CrimePerCapita -0.11228 0.03202 -3.507 0.000512 ***
## RoomsPerDwelling 0.31032 0.03618 8.577 3.06e-16 ***
## LowStatus -0.57864 0.03952 -14.641 < 2e-16 ***
## RMLStat -0.25880 0.02241 -11.549 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4919 on 355 degrees of freedom
## Multiple R-squared: 0.7615, Adjusted R-squared: 0.7588
## F-statistic: 283.4 on 4 and 355 DF, p-value: < 2.2e-16
predictedHomeValue_LR3 <- predict(ModelLR3, TestSet2)
TestSet2['Predicted.HomeValueLR3'] <- predictedHomeValue_LR3
pl3 <- TestSet2 %>%
ggplot(aes(x = HomeValue, y = Predicted.HomeValueLR3)) +
geom_point(alpha = 0.5) +
stat_smooth(color = 'black') +
xlab('Actual Home Value') +
ylab('Predicted Home Value')+
theme_bw()
ggplotly(pl3)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## Warning: The following aesthetics were dropped during statistical transformation:
## x_plotlyDomain, y_plotlyDomain
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
# Finally, we'll check the prediction accuracy with the MSE, MAE, RMSE, and R-squared metrics.
mse = MSE(TestSet2$HomeValue, predictedHomeValue_LR3)
mae = MAE(TestSet2$HomeValue, predictedHomeValue_LR3)
rmse = RMSE(TestSet2$HomeValue, predictedHomeValue_LR3)
r2 = R2(TestSet2$HomeValue, predictedHomeValue_LR3, form = "traditional")
cat(" MAE:", mae, "\n", "MSE:", mse, "\n",
"RMSE:", rmse, "\n", "R-squared:", r2)
## MAE: 0.3454339
## MSE: 0.2801432
## RMSE: 0.5292855
## R-squared: 0.5946973
The RMSE for the interaction model improved from 0.558 in Model 1 to 0.529, but the R-squared worsened from 0.625 to 0.595, which is not good. Our best model so far is therefore Model 1, with RMSE 0.558359 and R-squared 0.6252161. We will try further model families to see if we can improve on this.
Comparing models on their performance on the Train Set using Stargazer
stargazer(ModelLR1, ModelLR2, ModelLR3, title = "Comparing three Linear Regression Models", align = TRUE, type = 'text')
##
## Comparing three Linear Regression Models
## ==================================================================================================
## Dependent variable:
## --------------------------------------------------------------------------
## HomeValue
## (1) (2) (3)
## --------------------------------------------------------------------------------------------------
## CrimePerCapita -0.126*** -0.111*** -0.112***
## (0.038) (0.043) (0.032)
##
## Residential 0.135***
## (0.040)
##
## NonRetail 0.052
## (0.054)
##
## CharlesRiver 0.034
## (0.027)
##
## N_Oxides -0.207***
## (0.056)
##
## RoomsPerDwelling 0.343*** 0.430*** 0.310***
## (0.039) (0.040) (0.036)
##
## Pre40HouseAge 0.024
## (0.049)
##
## BusinessCenterProximity -0.313*** -0.120***
## (0.051) (0.036)
##
## HighwayProximity 0.294*** -0.072*
## (0.074) (0.042)
##
## Tax -0.256***
## (0.083)
##
## PupilPerTeacher -0.207***
## (0.036)
##
## BlackIndex 0.122***
## (0.032)
##
## LowStatus -0.389*** -0.474*** -0.579***
## (0.048) (0.048) (0.040)
##
## RMLStat -0.259***
## (0.022)
##
## Constant -0.030 -0.027 -0.179***
## (0.027) (0.030) (0.029)
##
## --------------------------------------------------------------------------------------------------
## Observations 360 360 360
## R2 0.755 0.683 0.762
## Adjusted R2 0.745 0.678 0.759
## Residual Std. Error 0.505 (df = 346) 0.568 (df = 354) 0.492 (df = 355)
## F Statistic 81.838*** (df = 13; 346) 152.385*** (df = 5; 354) 283.412*** (df = 4; 355)
## ==================================================================================================
## Note: *p<0.1; **p<0.05; ***p<0.01
We can see from the table that the adjusted R-squared is highest for the interaction model. However, our evaluation on the Test Set says otherwise.
#Model SVM
#Fit SVR model and visualize using scatter plot
#Regression with SVM
ModelSVM <- svm(HomeValue~., data=TrainSet)
summary(ModelSVM)
##
## Call:
## svm(formula = HomeValue ~ ., data = TrainSet)
##
##
## Parameters:
## SVM-Type: eps-regression
## SVM-Kernel: radial
## cost: 1
## gamma: 0.07692308
## epsilon: 0.1
##
##
## Number of Support Vectors: 247
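The gamma reported in the summary is just e1071's default of one over the number of predictors:
# Default gamma for svm() with 13 predictors
1 / 13 # 0.07692308, matching the gamma in the summary above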
#Predict using SVM regression
PreditedHomeValueSVM = predict(ModelSVM, TestSet)
#Overlay SVM Predictions on Scatter Plot
x = 1:length(TestSet$HomeValue)
plot(x, TestSet$HomeValue, pch=18, col="red")
lines(x, PreditedHomeValueSVM, lwd=1, col="blue")
# Finally, we'll check the prediction accuracy with the MSE, MAE, RMSE, and R-squared metrics.
mse = MSE(TestSet$HomeValue, PreditedHomeValueSVM)
mae = MAE(TestSet$HomeValue, PreditedHomeValueSVM)
rmse = RMSE(TestSet$HomeValue, PreditedHomeValueSVM)
r2 = R2(TestSet$HomeValue, PreditedHomeValueSVM, form = "traditional")
cat(" MAE:", mae, "\n", "MSE:", mse, "\n",
"RMSE:", rmse, "\n", "R-squared:", r2)
## MAE: 0.2514065
## MSE: 0.1814117
## RMSE: 0.4259245
## R-squared: 0.7020166
Our second modeling approach for regression is SVM. We fitted the training set with an SVM and then checked the model's performance on the test set. Our best performance so far had been Model 1, with RMSE 0.558359 and R-squared 0.6252161. ModelSVM achieves RMSE 0.4259245 and R-squared 0.7020166, which is much better.
## Tuning SVR model by varying values of maximum allowable error and cost parameter
#Tune the SVM model
OptModelSVM=tune(svm, HomeValue~., data=TrainSet,ranges=list(epsilon=seq(0,1,0.1), cost=1:20))
#Print optimum value of parameters
print(OptModelSVM)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## epsilon cost
## 0.1 8
##
## - best performance: 0.1655382
#Plot the performance of SVM Regression model
plot(OptModelSVM)
OptModelSVM has epsilon and cost values of 0.1 and 8, respectively. The plot above visualizes the performance of each candidate model; the legend on the right shows the mean squared error (MSE = RMSE^2), which is also a performance indicator.
## Select the best model and compute RMSE and R2
#Find out the best model
BestModelSVM=OptModelSVM$best.model
#Predict HomeValue using best model
PreditedHomeValueSVMBest=predict(BestModelSVM,TestSet)
#Calculate RMSE of the best model
rmseSVMBest = RMSE(TestSet$HomeValue, PreditedHomeValueSVMBest)
r2SVMBest = R2(TestSet$HomeValue, PreditedHomeValueSVMBest, form = "traditional")
cat( "RMSE:", rmseSVMBest, "\n",
"R-squared:", r2SVMBest)
## RMSE: 0.3232587
## R-squared: 0.8674123
#Overlay Predictions on Scatter Plot
x = 1:length(TestSet$HomeValue)
plot(x, TestSet$HomeValue, pch=18, col="red")
lines(x, PreditedHomeValueSVMBest, lwd=1, col="blue")
Our best performance before tuning was Model 4 (ModelSVM), with RMSE 0.4259245 and R-squared 0.7020166.
We then tuned the SVM hyperparameters, evaluating every combination of epsilon from 0 to 1 in steps of 0.1 and cost from 1 to 20. The best model, at epsilon 0.1 and cost 8, gives RMSE 0.3232587 and R-squared 0.8674123. Tuning reduced our RMSE from 0.43 to 0.32 and increased our R-squared from 0.70 to 0.87, the best model performance so far. We can clearly see from the three overlay graphs that the tuned model gives the best fit.
# Model Decision Tree
#visualization
ModelDT <- rpart(HomeValue~., data=TrainSet)
summary(ModelDT)
## Call:
## rpart(formula = HomeValue ~ ., data = TrainSet)
## n= 360
##
## CP nsplit rel error xerror xstd
## 1 0.46088001 0 1.0000000 1.0050557 0.09858698
## 2 0.19257920 1 0.5391200 0.7060990 0.07980596
## 3 0.05762849 2 0.3465408 0.5029066 0.06647727
## 4 0.04155313 3 0.2889123 0.4333845 0.06153181
## 5 0.03628437 4 0.2473592 0.3920576 0.05691661
## 6 0.01771724 5 0.2110748 0.3141143 0.04706421
## 7 0.01689102 6 0.1933576 0.3024650 0.04701315
## 8 0.01118188 7 0.1764665 0.2767035 0.04356295
## 9 0.01000000 8 0.1652847 0.2722551 0.04224304
##
## Variable importance
## LowStatus RoomsPerDwelling NonRetail
## 26 20 12
## N_Oxides CrimePerCapita Residential
## 12 12 9
## Pre40HouseAge BusinessCenterProximity PupilPerTeacher
## 3 2 2
## BlackIndex
## 1
##
## Node number 1: 360 observations, complexity param=0.46088
## mean=0.001869466, MSE=1.000486
## left son=2 (211 obs) right son=3 (149 obs)
## Primary splits:
## LowStatus < -0.4254358 to the right, improve=0.4608800, (0 missing)
## RoomsPerDwelling < 1.033088 to the left, improve=0.4593216, (0 missing)
## NonRetail < -0.5702008 to the right, improve=0.2626432, (0 missing)
## PupilPerTeacher < 0.5517305 to the right, improve=0.2309303, (0 missing)
## N_Oxides < 0.8828701 to the right, improve=0.2157732, (0 missing)
## Surrogate splits:
## NonRetail < -0.5118948 to the right, agree=0.817, adj=0.557, (0 split)
## N_Oxides < -0.3080409 to the right, agree=0.806, adj=0.530, (0 split)
## RoomsPerDwelling < 0.183408 to the left, agree=0.797, adj=0.510, (0 split)
## CrimePerCapita < -0.4049939 to the right, agree=0.786, adj=0.483, (0 split)
## Residential < 0.1559169 to the left, agree=0.767, adj=0.436, (0 split)
##
## Node number 2: 211 observations, complexity param=0.05762849
## mean=-0.5687563, MSE=0.2613397
## left son=4 (54 obs) right son=5 (157 obs)
## Primary splits:
## LowStatus < 0.9210027 to the right, improve=0.3764114, (0 missing)
## CrimePerCapita < 0.3799476 to the right, improve=0.3574901, (0 missing)
## BusinessCenterProximity < -0.883119 to the left, improve=0.3407602, (0 missing)
## N_Oxides < 0.8828701 to the right, improve=0.2744873, (0 missing)
## Tax < 0.9449719 to the right, improve=0.2522448, (0 missing)
## Surrogate splits:
## BusinessCenterProximity < -1.046912 to the left, agree=0.834, adj=0.352, (0 split)
## CrimePerCapita < 0.6981747 to the right, agree=0.820, adj=0.296, (0 split)
## Pre40HouseAge < 1.075535 to the right, agree=0.806, adj=0.241, (0 split)
## RoomsPerDwelling < -1.048415 to the left, agree=0.796, adj=0.204, (0 split)
## BlackIndex < -3.39213 to the left, agree=0.763, adj=0.074, (0 split)
##
## Node number 3: 149 observations, complexity param=0.1925792
## mean=0.8099368, MSE=0.9331197
## left son=6 (103 obs) right son=7 (46 obs)
## Primary splits:
## RoomsPerDwelling < 1.034512 to the left, improve=0.4988837, (0 missing)
## LowStatus < -1.174624 to the right, improve=0.4182055, (0 missing)
## PupilPerTeacher < -1.711606 to the right, improve=0.2048885, (0 missing)
## N_Oxides < 0.1665976 to the left, improve=0.1708625, (0 missing)
## Pre40HouseAge < 0.8055423 to the left, improve=0.1474321, (0 missing)
## Surrogate splits:
## LowStatus < -1.194929 to the right, agree=0.779, adj=0.283, (0 split)
## PupilPerTeacher < -1.803987 to the right, agree=0.758, adj=0.217, (0 split)
## Residential < 3.264509 to the left, agree=0.725, adj=0.109, (0 split)
## NonRetail < -1.389401 to the right, agree=0.725, adj=0.109, (0 split)
## N_Oxides < 0.7275339 to the left, agree=0.718, adj=0.087, (0 split)
##
## Node number 4: 54 observations, complexity param=0.01118188
## mean=-1.103551, MSE=0.1811411
## left son=8 (39 obs) right son=9 (15 obs)
## Primary splits:
## N_Oxides < 0.4168615 to the right, improve=0.4117348, (0 missing)
## Tax < 0.8500374 to the right, improve=0.3422643, (0 missing)
## PupilPerTeacher < 0.5517305 to the right, improve=0.3392276, (0 missing)
## CrimePerCapita < 0.463346 to the right, improve=0.3372936, (0 missing)
## BusinessCenterProximity < -0.8860397 to the left, improve=0.2934819, (0 missing)
## Surrogate splits:
## BusinessCenterProximity < -0.7596215 to the left, agree=0.907, adj=0.667, (0 split)
## Tax < -0.06667466 to the right, agree=0.907, adj=0.667, (0 split)
## NonRetail < 0.4676467 to the right, agree=0.870, adj=0.533, (0 split)
## CrimePerCapita < -0.2447477 to the right, agree=0.852, adj=0.467, (0 split)
## PupilPerTeacher < 0.5517305 to the right, agree=0.778, adj=0.200, (0 split)
##
## Node number 5: 157 observations, complexity param=0.01771724
## mean=-0.3848141, MSE=0.156718
## left son=10 (65 obs) right son=11 (92 obs)
## Primary splits:
## LowStatus < 0.3286538 to the right, improve=0.2593529, (0 missing)
## CrimePerCapita < 0.1214127 to the right, improve=0.1947878, (0 missing)
## PupilPerTeacher < 0.4593494 to the right, improve=0.1654144, (0 missing)
## Pre40HouseAge < 0.5941661 to the right, improve=0.1591771, (0 missing)
## BlackIndex < -1.733219 to the left, improve=0.1525752, (0 missing)
## Surrogate splits:
## Pre40HouseAge < 0.8019898 to the right, agree=0.745, adj=0.385, (0 split)
## CrimePerCapita < 0.08039561 to the right, agree=0.688, adj=0.246, (0 split)
## N_Oxides < 0.5549381 to the right, agree=0.675, adj=0.215, (0 split)
## BusinessCenterProximity < -0.8009852 to the left, agree=0.669, adj=0.200, (0 split)
## BlackIndex < -0.3411367 to the left, agree=0.656, adj=0.169, (0 split)
##
## Node number 6: 103 observations, complexity param=0.04155313
## mean=0.3539748, MSE=0.3976233
## left son=12 (96 obs) right son=13 (7 obs)
## Primary splits:
## Pre40HouseAge < 0.9067897 to the left, improve=0.3654334, (0 missing)
## LowStatus < -1.122111 to the right, improve=0.3301865, (0 missing)
## BusinessCenterProximity < -0.8418265 to the right, improve=0.3225409, (0 missing)
## N_Oxides < 0.2960444 to the left, improve=0.2799519, (0 missing)
## NonRetail < 0.791974 to the left, improve=0.2392294, (0 missing)
## Surrogate splits:
## BusinessCenterProximity < -0.9559211 to the right, agree=0.981, adj=0.714, (0 split)
## N_Oxides < 0.2960444 to the left, agree=0.971, adj=0.571, (0 split)
## CrimePerCapita < -0.3030303 to the left, agree=0.961, adj=0.429, (0 split)
## NonRetail < 0.791974 to the left, agree=0.961, adj=0.429, (0 split)
## HighwayProximity < 0.7408293 to the left, agree=0.942, adj=0.143, (0 split)
##
## Node number 7: 46 observations, complexity param=0.03628437
## mean=1.830895, MSE=0.6242919
## left son=14 (23 obs) right son=15 (23 obs)
## Primary splits:
## RoomsPerDwelling < 1.640105 to the left, improve=0.45507970, (0 missing)
## LowStatus < -1.213134 to the right, improve=0.42065130, (0 missing)
## PupilPerTeacher < -1.573034 to the right, improve=0.24842000, (0 missing)
## Pre40HouseAge < 0.5106814 to the left, improve=0.19949120, (0 missing)
## CrimePerCapita < -0.3601142 to the left, improve=0.09966569, (0 missing)
## Surrogate splits:
## LowStatus < -1.213134 to the right, agree=0.761, adj=0.522, (0 split)
## CrimePerCapita < -0.4069924 to the left, agree=0.696, adj=0.391, (0 split)
## BlackIndex < 0.3804811 to the right, agree=0.696, adj=0.391, (0 split)
## PupilPerTeacher < -1.573034 to the right, agree=0.674, adj=0.348, (0 split)
## Pre40HouseAge < 0.4893661 to the left, agree=0.652, adj=0.304, (0 split)
##
## Node number 8: 39 observations
## mean=-1.272919, MSE=0.1167287
##
## Node number 9: 15 observations
## mean=-0.663195, MSE=0.08011764
##
## Node number 10: 65 observations
## mean=-0.6246655, MSE=0.1230763
##
## Node number 11: 92 observations
## mean=-0.2153539, MSE=0.1111246
##
## Node number 12: 96 observations, complexity param=0.01689102
## mean=0.2510421, MSE=0.1632389
## left son=24 (80 obs) right son=25 (16 obs)
## Primary splits:
## LowStatus < -1.083601 to the right, improve=0.38821680, (0 missing)
## RoomsPerDwelling < 0.4381698 to the left, improve=0.36974710, (0 missing)
## Tax < -0.8350514 to the right, improve=0.12863220, (0 missing)
## PupilPerTeacher < 0.3207778 to the right, improve=0.11255210, (0 missing)
## HighwayProximity < -0.8096011 to the left, improve=0.08848849, (0 missing)
## Surrogate splits:
## RoomsPerDwelling < 0.6303086 to the left, agree=0.875, adj=0.250, (0 split)
## Residential < 3.264509 to the left, agree=0.854, adj=0.125, (0 split)
## Tax < -1.119855 to the right, agree=0.844, adj=0.062, (0 split)
##
## Node number 13: 7 observations
## mean=1.765623, MSE=1.473981
##
## Node number 14: 23 observations
## mean=1.297882, MSE=0.1743252
##
## Node number 15: 23 observations
## mean=2.363908, MSE=0.5060535
##
## Node number 24: 80 observations
## mean=0.1384614, MSE=0.09443664
##
## Node number 25: 16 observations
## mean=0.8139457, MSE=0.1270178
#Visualizing the Decision tree
rpart.plot(ModelDT)
#Predict using Decision Tree
PreditedHomeValueDT = predict(ModelDT, TestSet)
#Overlay Decision Tree Predictions on Scatter Plot
x = 1:length(TestSet$HomeValue)
plot(x, TestSet$HomeValue, pch=18, col="red")
lines(x, PreditedHomeValueDT, lwd=1, col="blue")
# Finally, we'll check the prediction accuracy with the MSE, MAE, RMSE, and R-squared metrics.
mse = MSE(TestSet$HomeValue, PreditedHomeValueDT)
mae = MAE(TestSet$HomeValue, PreditedHomeValueDT)
rmse = RMSE(TestSet$HomeValue, PreditedHomeValueDT)
r2 = R2(TestSet$HomeValue, PreditedHomeValueDT, form = "traditional")
cat(" MAE:", mae, "\n", "MSE:", mse, "\n",
"RMSE:", rmse, "\n", "R-squared:", r2)
## MAE: 0.3612928
## MSE: 0.2570004
## RMSE: 0.506952
## R-squared: 0.6261858
We next try to improve performance using a decision tree. However, it could not beat the tuned SVM: the decision tree's RMSE of 0.51 is higher than the SVM's 0.32, and its R-squared of 0.63 is lower than the SVM's 0.867. The tuned SVM is still our best performer.
#check for important variables
ModelDT$variable.importance
## LowStatus RoomsPerDwelling NonRetail
## 225.639634 172.849836 108.569877
## N_Oxides CrimePerCapita Residential
## 107.997657 101.341823 80.714827
## Pre40HouseAge BusinessCenterProximity PupilPerTeacher
## 26.395078 21.954655 20.429869
## BlackIndex Tax HighwayProximity
## 7.731267 3.065188 2.138057
The regression tree can be pruned for better results. Pruning means trimming leaf nodes to reduce the size of the tree, which helps overcome overfitting. Pruning requires choosing the best complexity parameter (CP) value: the CP with the lowest cross-validation error (xerror), which we can read off the CP table or extract programmatically, as sketched below.
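A minimal sketch for pulling the best CP from the fitted tree's cptable rather than eyeballing it:
# CP with the lowest cross-validation error (here this picks CP = 0.01, the 8-split
# row, i.e. the full tree; see the discussion below the CP table)
bestcp <- ModelDT$cptable[which.min(ModelDT$cptable[, "xerror"]), "CP"]
bestcp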
#Finding the best CP value
printcp(ModelDT) #Choose the best CP value based on the lowest xerror from the CP table.
##
## Regression tree:
## rpart(formula = HomeValue ~ ., data = TrainSet)
##
## Variables actually used in tree construction:
## [1] LowStatus N_Oxides Pre40HouseAge RoomsPerDwelling
##
## Root node error: 360.17/360 = 1.0005
##
## n= 360
##
## CP nsplit rel error xerror xstd
## 1 0.460880 0 1.00000 1.00506 0.098587
## 2 0.192579 1 0.53912 0.70610 0.079806
## 3 0.057628 2 0.34654 0.50291 0.066477
## 4 0.041553 3 0.28891 0.43338 0.061532
## 5 0.036284 4 0.24736 0.39206 0.056917
## 6 0.017717 5 0.21107 0.31411 0.047064
## 7 0.016891 6 0.19336 0.30246 0.047013
## 8 0.011182 7 0.17647 0.27670 0.043563
## 9 0.010000 8 0.16528 0.27226 0.042243
#From the CP table, the lowest xerror value is 0.27226, at CP 0.010 with 8 splits.
#That is the same number of splits as our original tree, so pruning there would
#return the same tree. Hence, we use the next-lowest xerror value (0.27670),
#which corresponds to CP 0.011182 and 7 splits.
#Prune the tree with best cp value (complexity parameter)
prunedtree <- prune(ModelDT, cp = 0.011182)
#Visualizing the pruned tree
rpart.plot(prunedtree)
#Checking the order of variable importance
prunedtree$variable.importance
## LowStatus RoomsPerDwelling NonRetail
## 225.6396339 172.8498358 106.4219133
## N_Oxides CrimePerCapita Residential
## 103.9702242 99.4623549 80.7148267
## Pre40HouseAge PupilPerTeacher BusinessCenterProximity
## 26.3950778 19.6243823 19.2697003
## BlackIndex HighwayProximity Tax
## 7.7312671 2.1380566 0.3802326
#performance of regression tree
predprune <- predict(prunedtree, TestSet)
# Finally, we'll check the prediction accuracy with the MSE, MAE, RMSE, and R-squared metrics.
mse = MSE(TestSet$HomeValue, predprune )
mae = MAE(TestSet$HomeValue, predprune )
rmse = RMSE(TestSet$HomeValue, predprune )
r2 = R2(TestSet$HomeValue, predprune , form = "traditional")
cat(" MAE:", mae, "\n", "MSE:", mse, "\n",
"RMSE:", rmse, "\n", "R-squared:", r2)
## MAE: 0.3760765
## MSE: 0.2818576
## RMSE: 0.5309026
## R-squared: 0.5823294
Pruning the tree didn't reduce our RMSE or increase R-squared, so our first tree stands. Pruning is sometimes effective; in our case, it wasn't.
#Random Forest
set.seed(123)
ModelRF <- randomForest(HomeValue~., data=TrainSet, proximity=TRUE)
print(ModelRF)
##
## Call:
## randomForest(formula = HomeValue ~ ., data = TrainSet, proximity = TRUE)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 4
##
## Mean of squared residuals: 0.1357942
## % Var explained: 86.43
summary(ModelRF)
## Length Class Mode
## call 4 -none- call
## type 1 -none- character
## predicted 360 -none- numeric
## mse 500 -none- numeric
## rsq 500 -none- numeric
## oob.times 360 -none- numeric
## importance 13 -none- numeric
## importanceSD 0 -none- NULL
## localImportance 0 -none- NULL
## proximity 129600 -none- numeric
## ntree 1 -none- numeric
## mtry 1 -none- numeric
## forest 11 -none- list
## coefs 0 -none- NULL
## y 360 -none- numeric
## test 0 -none- NULL
## inbag 0 -none- NULL
## terms 3 terms call
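randomForest also tracks per-variable importance (mean decrease in node impurity for regression), which can be plotted directly; a sketch:
# Variable importance plot for the fitted forest
varImpPlot(ModelRF)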
#Predict using Random Forest
PreditedHomeValueRF = predict(ModelRF, TestSet)
#Overlay Random Forest Predictions on Scatter Plot
x = 1:length(TestSet$HomeValue)
plot(x, TestSet$HomeValue, pch=18, col="red")
lines(x, PreditedHomeValueRF, lwd=1, col="blue")
# Finally, we'll check the prediction accuracy with the MSE, MAE, RMSE, and R-squared metrics.
mse = MSE(TestSet$HomeValue, PreditedHomeValueRF)
mae = MAE(TestSet$HomeValue, PreditedHomeValueRF)
rmse = RMSE(TestSet$HomeValue, PreditedHomeValueRF)
r2 = R2(TestSet$HomeValue, PreditedHomeValueRF, form = "traditional")
cat(" MAE:", mae, "\n", "MSE:", mse, "\n",
"RMSE:", rmse, "\n", "R-squared:", r2)
## MAE: 0.2353101
## MSE: 0.1292226
## RMSE: 0.3594754
## R-squared: 0.810815
Random Forest didn't improve on our best performance either: its RMSE of 0.359 is higher than the tuned SVM's 0.32, and its R-squared of 0.81 is lower than the SVM's 0.867. The tuned SVM remains our best performer, although the random forest also performed very well.
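If we wanted to push the random forest further, the mtry parameter (4 by default here) could be tuned against the out-of-bag error; a sketch with illustrative stepFactor/improve settings:
# Tune mtry via OOB error; column 14 of TrainSet is HomeValue
set.seed(123)
tuneRF(TrainSet[, -14], TrainSet$HomeValue, stepFactor = 1.5, improve = 0.01)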
Conclusion:
To conclude, we used eight different models to predict the median home value of various localities in Boston: three linear regressions, two SVMs, two decision trees, and one random forest. The best model was the tuned SVM, with epsilon and cost values of 0.1 and 8 respectively, giving an RMSE of 0.32 and an R-squared of 0.87.